#include #include #include #include #include #include #include "mpi.h" /* MPI I/O BENCHMARKS This test should be run with n processors & zero to 3 arguments. These tests evaluate the performance of an MPI I/O implementation. In each run, each process reads or writes buf_size bytes (default: 1 MB) to a file. The file can be accessed contiguously (all of process 0 data, then all of process 1 data, etc.), or interleaved. In the interleaved case, block_len (default 256) bytes from process 0 will be written, then block_len bytes from process 1, etc., then a further blocken bytes from process 0, until buf_size bytes are written. The data in the output file may be evenly aligned or have a configurable offset (default is 1). The data may be accessed through collective or individual operations, and through synchronous or asynchronous/split-collective operations. COMMAND LINE: mpirun -np n ./io_test mpirun -np n ./io_test buf_size block_len offset This program has 3 optional arguments. If a first argument is given, it sets buf_size, the number of bytes each process reads or writes per test. An optional second arguments sets block_len, the block syze in bytes each process reads or writes for interleaved I/O. The third optional argument sets the offset in bytes from the beginning of the file for the offset tests. RESULTS: The results from the first run of each test are discarded. The minimum, arithmetic mean, and maximum runtime in microseconds from the remaining ITS-1 tests are displayed. Process 0 also verifies the result of each write test. All processes verify each read test. In the event of a test failure, the offending file is not erased. On message-passing systems, the entire test suite should take on the order of 5 minutes to run for the default arguments with 8 processors. On shared memory systems, the times are very low & unstable. Increasing the number of iterations (ITS) and each processor's portion of the file (buf_size) can help. */ int buf_size = 1024*1024; int block_len = 256; MPI_Offset offset = 1; char filename_pre[] = "io_test.out"; #define MAX_TESTS 100 #define ITS 11 int np; int id; MPI_Datatype cont_filetype, noncont_filetype; MPI_Datatype buftype; MPI_File fh; MPI_Status status; char *buf; char filename[20]; int verify_read(void) { int i; /* this is much simpler: each process just checks that its read buffer has the right data */ for (i = 0; i < buf_size; i++) if (buf[i] != '0' + id) return 0; return 1; } /* FIXME: The file may be too large to read into a single process. 1st fix: Read a block at a time 2nd fix: Let each process read a block at a time 3rd fix: Use MPI-IO read routines (collective) to test results */ int verify_write(int is_contig, int offset) { int f, size, blksize; char *buf, *b, *fbuf; int i, j, ret; /* process 0 reconstructs the correct data and compares it with the contents of the file */ size = buf_size; if (is_contig) blksize = buf_size; else blksize = block_len; fbuf = malloc(sizeof(*fbuf) * size * np); buf = b = malloc(sizeof(*buf) * size * np); /* skip offset bytes and slurp rest of file into fbuf */ f = open(filename, O_RDONLY); pread(f, fbuf, size * np, offset); close(f); /* regenerate correct data */ for (i = 0; i < size; i += blksize) { for (j = 0; j < np; j++) { memset(b, '0' + j, blksize); b += blksize; } } /* and compare */ ret = (strncmp(fbuf, buf, size * np) != 0); free(fbuf); free(buf); return ret; } struct test { int is_coll; int is_contig; int is_sync; int read_time[ITS]; int write_time[ITS]; long dataSize; int offset; char desc[100]; }; struct test tests[MAX_TESTS]; int ntests; void do_abort(char* f) { fprintf(stderr, "%s\n", f); MPI_Abort(MPI_COMM_WORLD, -2); } void add_test(int is_coll, int is_contig, int offset, int is_sync) { if (ntests == MAX_TESTS && id == 0) do_abort("increase MAX_TESTS"); tests[ntests].is_coll = is_coll; tests[ntests].is_contig = is_contig; tests[ntests].offset = offset; tests[ntests].is_sync = is_sync; snprintf(tests[ntests].desc, sizeof(tests[ntests].desc), "%s %s %s offset %5d", is_coll ? "collective" : "individual", is_contig ? "contig " : "interleaved", is_sync ? "sync " : "async", offset); ntests++; } void run_tests(void) { int t, it; double t1, t2; int fn2 = 0; /* Each process keeps a buffer of buf_size bytes filled with the character '0' + rank. For ranks 0-9, this byte is simply its rank in ASCII. E.g., process 2 writes '2'. For ranks 10 and above, it will be a less useful character. This can be useful for debugging. On the first iteration of each test, this buffer is written to a file, verified by process zero, then read back and verified by all. The timing is discarded. On all subsequent iterations, no checking is performed and the times are kept by process 0. */ for (it = 0; it < ITS; it++) { for (t = 0; t < ntests; t++) { /* WRITE THEN READ FILE */ int mode = MPI_MODE_CREATE | MPI_MODE_WRONLY; int offset = 0; int bufsize; MPI_Datatype ftype = tests[t].is_contig ? cont_filetype : noncont_filetype; snprintf(filename, sizeof(filename), "%s%d", filename_pre, fn2++); MPI_Barrier(MPI_COMM_WORLD); MPI_File_delete( filename, MPI_INFO_NULL ); MPI_Barrier(MPI_COMM_WORLD); MPI_File_open(MPI_COMM_WORLD, filename, mode, MPI_INFO_NULL, &fh); /* the offset & type (contiguous or interleaved) is set in the view */ MPI_File_set_view(fh, tests[t].offset, MPI_CHAR, ftype, "native", MPI_INFO_NULL); MPI_Type_size( buftype, &bufsize ); tests[t].dataSize = bufsize; MPI_Barrier(MPI_COMM_WORLD); t1 = MPI_Wtime(); /* the sync/async & collective/individual options are handled with branches */ if (tests[t].is_coll) { if (tests[t].is_sync) { MPI_File_write_at_all(fh, offset, buf, 1, buftype, &status); } else { MPI_File_write_at_all_begin(fh, offset, buf, 1, buftype); MPI_File_write_at_all_end(fh, buf, &status); } } else { if (tests[t].is_sync) { MPI_File_write_at(fh, offset, buf, 1, buftype, &status); } else { MPI_Request req; MPI_File_iwrite_at(fh, offset, buf, 1, buftype, &req); MPI_Wait(&req, &status); } } MPI_Barrier(MPI_COMM_WORLD); /* Ensure that the data is written out to the file system. */ MPI_File_sync( fh ); t2 = MPI_Wtime(); if (id == 0) { int tt; tt = (t2 - t1) * 1000000; if (it == 0) { if (verify_write(tests[t].is_contig, tests[t].offset)) { char s[200]; snprintf(s, sizeof(s), "file incorrect for test: %s", tests[t].desc); do_abort(s); } } tests[t].write_time[it] = tt; } MPI_File_close(&fh); mode = MPI_MODE_RDONLY; offset = 0; MPI_File_open(MPI_COMM_WORLD, filename, mode, MPI_INFO_NULL, &fh); /* the offset & type (contiguous or interleaved) is set in the view */ MPI_File_set_view(fh, tests[t].offset, MPI_CHAR, ftype, "native", MPI_INFO_NULL); MPI_Barrier(MPI_COMM_WORLD); t1 = MPI_Wtime(); /* the sync/async & collective/individual options are handled with branches */ if (tests[t].is_coll) { if (tests[t].is_sync) { MPI_File_read_at_all(fh, offset, buf, 1, buftype, &status); } else { MPI_File_read_at_all_begin(fh, offset, buf, 1, buftype); MPI_File_read_at_all_end(fh, buf, &status); } } else { if (tests[t].is_sync) { MPI_File_read_at(fh, offset, buf, 1, buftype, &status); } else { MPI_Request req; MPI_File_iread_at(fh, offset, buf, 1, buftype, &req); MPI_Wait(&req, &status); } } MPI_Barrier(MPI_COMM_WORLD); t2 = MPI_Wtime(); MPI_File_close(&fh); if (it == 0 && verify_read() == 0) { char s[200]; snprintf(s, sizeof(s), "data read incorrect for test: %s", tests[t].desc); do_abort(s); } if (id == 0) { int tt; tt = (t2 - t1) * 1000000; tests[t].read_time[it] = tt; MPI_File_delete( filename, MPI_INFO_NULL ); } } } } void print_results(void) { int min, max, sum, i, j, np; double rateEach, rateAgg; MPI_Comm_size( MPI_COMM_WORLD, &np ); if (id == 0) { printf("%40s\tMIN\tAVG\tMAX (us/run)\n", "TEST"); for (i = 0; i < ntests; i++) { min = max = sum = tests[i].read_time[1]; for (j = 2; j < ITS; j++) { if (tests[i].read_time[j] < min) min = tests[i].read_time[j]; if (tests[i].read_time[j] > max) max = tests[i].read_time[j]; sum += tests[i].read_time[j]; } rateEach = tests[i].dataSize / (1.0*max); rateAgg = rateEach * np; printf("%40s read:\t%6d\t%6d\t%7d\t%f\t%f\n", tests[i].desc, min, sum/(ITS-1), max, rateEach, rateAgg ); } for (i = 0; i < ntests; i++) { min = max = sum = tests[i].write_time[1]; for (j = 2; j < ITS; j++) { if (tests[i].write_time[j] < min) min = tests[i].write_time[j]; if (tests[i].write_time[j] > max) max = tests[i].write_time[j]; sum += tests[i].write_time[j]; } rateEach = tests[i].dataSize / (1.0*max); rateAgg = rateEach * np; printf("%s write:\t%6d\t%6d\t%7d\t%f\t%f\n", tests[i].desc, min, sum/(ITS-1), max, rateEach, rateAgg ); } } } void usage_and_die(char* arg) { if (id == 0) { fprintf(stderr, "usage:\n\tmpirun -np n %s [buf_size] [block_len] [offset]\nall parameters optional\nbuf_size must be a multiple of block_len\n", arg); } MPI_Finalize(); exit(1); } int main( int argc, char *argv[] ) { int length[ 3 ]; MPI_Aint disp[ 3 ]; MPI_Datatype type[ 3 ]; int is_coll, is_contig, is_sync, is_offset; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &id); MPI_Comm_size(MPI_COMM_WORLD, &np); if (argc > 1) { char* end_p; buf_size = strtol(argv[1], &end_p, 10); if (*end_p != '\0') usage_and_die(argv[0]); } if (argc > 2) { char* end_p; block_len = strtol(argv[2], &end_p, 10); if (*end_p != '\0') usage_and_die(argv[0]); } if (argc > 3) { char* end_p; offset = strtol(argv[3], &end_p, 10); if (*end_p != '\0') usage_and_die(argv[0]); } if (buf_size / block_len * block_len != buf_size) do_abort("buf_size must be multiple of block_len"); if (block_len <= 0 || buf_size <= 0 || offset < 0) do_abort("bad arguments"); if (id == 0) printf("buf_size: %d, block_len: %d, offset: %5d\n\n", buf_size, block_len, (int) offset); /* the buffer is contiguous data */ buf = malloc(buf_size); memset(buf, '0' + id, buf_size); MPI_Type_contiguous(buf_size, MPI_CHAR, &buftype); MPI_Type_commit(&buftype); /* this filetypes interleaves block_len bytes at a time from each process until each has written buf_size bytes */ length[ 0 ] = 1; length[ 1 ] = block_len; length[ 2 ] = 1; disp[ 0 ] = 0; disp[ 1 ] = block_len * id; disp[ 2 ] = block_len * np; type[ 0 ] = MPI_LB; type[ 1 ] = MPI_CHAR; type[ 2 ] = MPI_UB; MPI_Type_struct(3, length, disp, type, &noncont_filetype); MPI_Type_commit(&noncont_filetype); length[ 0 ] = 1; length[ 1 ] = buf_size; length[ 2 ] = 1; disp[ 0 ] = 0; disp[ 1 ] = buf_size * id; disp[ 2 ] = buf_size * np; type[ 0 ] = MPI_LB; type[ 1 ] = MPI_CHAR; type[ 2 ] = MPI_UB; /* this filetype has all block_len bytes from each process written sequentially */ MPI_Type_struct(3, length, disp, type, &cont_filetype); MPI_Type_commit(&cont_filetype); for (is_coll = 0; is_coll <= 1; is_coll++) for (is_contig = 0; is_contig <= 1; is_contig++) for (is_offset = 0; is_offset <= 1; is_offset++) for (is_sync = 1; is_sync >= 0; is_sync--) add_test(is_coll, is_contig, is_offset ? offset : 0, is_sync); run_tests(); print_results(); MPI_Type_free(&buftype); MPI_Type_free(&cont_filetype); free(buf); MPI_Finalize(); exit(0); }